#! /usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) Huoty, All rights reserved
# Author: Huoty <sudohuoty@163.com>

"""系统守护程序"""

import errno
import fcntl
import logging
import os
import signal
import sys
import time
from argparse import ArgumentParser
from multiprocessing import cpu_count

import psutil
from psutil import Process, NoSuchProcess


# Logger used by every function and class in this module; its output is
# configured by the logging.basicConfig() call in main().
log = logging.getLogger("sysguard")


def per_load():
    """Return the 1-minute system load average divided by the CPU core count.

    Any error raised while reading the load average or the core count is
    logged as a warning and reported as ``0``.
    """
    try:
        one_minute_load = os.getloadavg()[0]
        core_count = cpu_count()
        return one_minute_load / core_count
    except Exception as ex:
        log.warning(ex)
        return 0


def cpu_percent():
    """Return the system-wide CPU usage percentage.

    The value is measured over a blocking 1-second window. If psutil raises
    ``ZeroDivisionError`` the error is logged and ``float("inf")`` is
    returned, so any comparison against a limit treats the CPU as overloaded.
    """
    try:
        # Utilisation over the most recent 1 second.
        usage = psutil.cpu_percent(interval=1)
    except ZeroDivisionError as ex:
        log.warning(ex)
        usage = float("inf")
    return usage


def memory_percent():
    """Return the system-wide virtual memory usage percentage.

    If psutil raises ``ZeroDivisionError`` the error is logged and
    ``float("inf")`` is returned, so any comparison against a limit treats
    the memory as exhausted.
    """
    try:
        usage = psutil.virtual_memory().percent
    except ZeroDivisionError as ex:
        log.warning(ex)
        usage = float("inf")
    return usage


def sys_elapsed_seconds():
    """Return the number of seconds between the boot time and now."""
    now = time.time()
    booted_at = psutil.boot_time()
    return now - booted_at


def send_signal(proc, sig):
    """Send a signal to a process via ``os.kill``.

    :param proc: a PID (``int``) or any object exposing a ``pid`` attribute.
    :param sig: a signal number, or a signal name such as ``"SIGTERM"``
        (case-insensitive) that is looked up on the ``signal`` module.
    :raises AttributeError: if ``sig`` is a string that does not name an
        attribute of the ``signal`` module.
    """
    if isinstance(proc, int):
        target_pid = proc
    else:
        target_pid = proc.pid

    if isinstance(sig, str):
        signum = getattr(signal, sig.upper())
    else:
        signum = sig

    os.kill(target_pid, signum)


class WrappedProcess(object):
    """Proxy around a process object that tolerates vanished processes.

    The explicitly wrapped calls (``cpu_percent``, ``mem_percent``,
    ``cmdline`` and the ``cmd`` property) return a neutral value instead of
    raising ``NoSuchProcess``. Every other attribute is forwarded to the
    wrapped object.
    """

    def __init__(self, proc):
        """Wrap ``proc``, which is either a PID (``int``) or a process object."""
        self._proc = Process(proc) if isinstance(proc, int) else proc

    def __getattr__(self, name):
        """Forward unknown attributes to the wrapped process.

        ``__getattr__`` only runs when normal lookup fails. If ``_proc``
        itself is missing (instance built through ``__new__``, ``copy`` or
        unpickling), reading ``self._proc`` here would re-enter this method
        forever, so that case is reported as a plain ``AttributeError``.
        """
        if "_proc" not in self.__dict__:
            raise AttributeError(name)
        try:
            return getattr(self.__dict__["_proc"], name)
        except NoSuchProcess:
            return None

    def cpu_percent(self, *args, **kwargs):
        """Return the process CPU usage, or ``0.0`` if the process is gone."""
        try:
            return self._proc.cpu_percent(*args, **kwargs)
        except NoSuchProcess:
            return 0.0

    def mem_percent(self, *args, **kwargs):
        """Return the process memory usage, or ``0.0`` if the process is gone."""
        try:
            return self._proc.memory_percent(*args, **kwargs)
        except NoSuchProcess:
            return 0.0

    def cmdline(self, *args, **kwargs):
        """Return the command line as a list, or ``[]`` if the process is gone."""
        try:
            return self._proc.cmdline(*args, **kwargs)
        except NoSuchProcess:
            return []

    @property
    def cmd(self):
        """Command line joined with spaces, or ``''`` if the process is gone."""
        try:
            return ' '.join(self._proc.cmdline())
        except NoSuchProcess:
            return ''


def process_iter(*args, **kwargs):
    """Yield each process from ``psutil.process_iter`` as a ``WrappedProcess``.

    All positional and keyword arguments are forwarded unchanged.
    """
    running = psutil.process_iter(*args, **kwargs)
    for raw_proc in running:
        yield WrappedProcess(raw_proc)


class SysGuard(object):
    """Watchdog that signals the heaviest processes when the host is overloaded.

    The per-core load, the CPU usage and the memory usage of the host are
    compared against the configured limits. When a limit is reached, the top
    consumers are inspected and sent SIGTERM (or SIGKILL when forced).
    """

    def __init__(self, load_limit=1.2, cpu_limit=100, mem_limit=88,
                 lock_file=None, debug=False):
        """Store the limits and resolve the lock file path.

        :param load_limit: upper bound for the value returned by ``per_load()``.
        :param cpu_limit: upper bound for the CPU usage percentage.
        :param mem_limit: upper bound for the memory usage percentage.
        :param lock_file: lock file path; a default location is picked when
            it is empty or ``None``.
        :param debug: when true, no signal is actually sent.
        """
        self.load_limit = load_limit
        self.cpu_limit = cpu_limit
        self.mem_limit = mem_limit
        self.debug = debug

        # Lock file path and the open handle that holds the lock.
        self._lock_file = lock_file or self._get_default_lock_file()
        self._lock_handle = None

    def _get_default_lock_file(self):
        """Return ``sysguard.lock`` inside the first existing candidate directory.

        If no candidate exists, the last one (``/var/run``) is used.
        """
        home = os.getenv("HOME", "/root")
        candidates = [
            # NOTE(review): "./local/run" resolves to $HOME/local/run; this may
            # have been meant as ".local/run" -- confirm before changing.
            os.path.join(home, "./local/run"),
            os.path.join(home, "run"),
            "/usr/local/var/run",
            "/var/run"
        ]
        rundir = None
        for path in candidates:
            rundir = path
            if os.path.exists(path):
                break
        return os.path.join(rundir, "sysguard.lock")

    def _has_enough_cpu_resources(self, log_if_no_enough=True):
        """Return True when both per-core load and CPU usage are below their limits.

        Note that ``cpu_percent()`` blocks for one second while measuring.

        :param log_if_no_enough: log an error for each exceeded limit.
        """
        sys_per_load = per_load()
        sys_cpu_pct = cpu_percent()
        if sys_per_load < self.load_limit and sys_cpu_pct < self.cpu_limit:
            return True
        if log_if_no_enough:
            if sys_per_load >= self.load_limit:
                log.error("Host per load %.2f is too large", sys_per_load)
            if sys_cpu_pct >= self.cpu_limit:
                log.error("Host cpu usage %s%% is too large", sys_cpu_pct)
        return False

    def _has_enough_mem_resources(self, log_if_no_enough=True):
        """Return True when memory usage is below ``mem_limit``.

        :param log_if_no_enough: log an error when the limit is reached.
        """
        mem_pct = memory_percent()
        if mem_pct < self.mem_limit:
            return True
        if log_if_no_enough:
            log.error("Host memory usage %s%% is too large", mem_pct)
        return False

    def handle_unsafe_process(self, proc, force_kill=False):
        """Signal ``proc`` if it is a significant resource consumer.

        :param proc: a ``WrappedProcess``-like object.
        :param force_kill: send SIGKILL instead of SIGTERM.
        :returns: ``1`` when the signal was sent (or would have been sent in
            debug mode), ``0`` when the process was skipped or already gone.
        """
        if not proc.is_running():
            return 0

        # Sample once: the decision and the log line then use the same
        # numbers, and the blocking 0.5 s CPU measurement is not repeated.
        mem_pct = proc.mem_percent()
        cpu_pct = proc.cpu_percent(interval=0.5)
        if mem_pct < 1 and cpu_pct < 5:
            return 0

        proc_desc = "Process(pid=%s, mem=%.2f%%, cpu=%.2f%%, cmd='%s')" % (
            proc.pid,
            mem_pct,
            cpu_pct,
            " ".join(proc.cmdline()).strip(),
        )
        log.warning("%s is unsafe", proc_desc)

        if proc.pid < 10:  # ignore kernel processes
            return 0

        sig = "SIGKILL" if force_kill else "SIGTERM"
        try:
            if not self.debug:
                send_signal(proc.pid, sig)
        except NoSuchProcess:
            return 0
        except OSError as ex:
            # os.kill reports a vanished process as ESRCH, not as psutil's
            # NoSuchProcess; any other OS error is still propagated.
            if ex.errno != errno.ESRCH:
                raise
            return 0
        else:
            log.error("kill -%s %s", sig, proc_desc)
        return 1

    def check_memory(self):
        """Signal the top memory consumers while memory usage is at the limit.

        At most the five processes with the highest memory usage are
        considered; the loop stops once a process has been signalled and
        memory is back under the limit.
        """
        if self._has_enough_mem_resources(log_if_no_enough=True):
            return
        unsafe_processes = sorted(
            process_iter(), key=lambda proc: proc.mem_percent(), reverse=True
        )[:5]
        for proc in unsafe_processes:
            if (
                self.handle_unsafe_process(proc) == 1 and
                self._has_enough_mem_resources(log_if_no_enough=False)
            ):
                break

    def check_cpu_load(self):
        """Signal the top CPU consumers while load or CPU usage is at the limit.

        At most the five processes with the highest CPU usage are considered,
        and those below 10% are skipped.
        """
        if self._has_enough_cpu_resources(log_if_no_enough=True):
            return

        # Measure every process over one shared 0.5 s window instead of
        # blocking 0.5 s per process, which takes minutes on a busy host.
        procs = list(process_iter())
        for proc in procs:
            proc.cpu_percent(interval=None)  # start the measuring window
        time.sleep(0.5)
        usages = [(proc.cpu_percent(interval=None), proc) for proc in procs]

        unsafe_processes = sorted(
            usages, key=lambda item: item[0], reverse=True
        )[:5]
        for cpu_pct, proc in unsafe_processes:
            if cpu_pct < 10:
                continue
            self.handle_unsafe_process(proc)
            if self._has_enough_cpu_resources(log_if_no_enough=False):
                break

    def acquire_lock(self):
        """Try to take an exclusive, non-blocking lock on the lock file.

        :returns: ``True`` when the lock was obtained, ``False`` otherwise.
        """
        handle = open(self._lock_file, 'a+')
        lock_cmd = fcntl.LOCK_EX | fcntl.LOCK_NB
        try:
            fcntl.lockf(handle, lock_cmd)
        except IOError:
            # Do not leak the file descriptor when the lock is unavailable.
            handle.close()
            return False
        self._lock_handle = handle
        return True

    def release_lock(self):
        """Unlock and close the lock file.

        :returns: ``True`` on success.
        :raises Exception: if the lock file no longer exists.
        """
        if not os.path.exists(self._lock_file):
            raise Exception("No such lock _lock_file: %r" % self._lock_file)
        fp = self._lock_handle or open(self._lock_file, 'a+')
        try:
            fcntl.lockf(fp, fcntl.LOCK_UN)
        finally:
            fp.close()
            # Forget the closed handle so that a later call does not reuse it.
            self._lock_handle = None
        return True

    def check(self):
        """Run one memory check followed by one CPU/load check."""
        self.check_memory()
        self.check_cpu_load()

    def __call__(self, interval=120):
        """Run checks forever, sleeping ``interval`` seconds between rounds.

        Returns immediately when the lock cannot be acquired. Checks are
        skipped during the first 180 seconds after boot. The loop ends after
        10 failed rounds in total, and the lock is released on the way out.
        """
        if not self.acquire_lock():
            log.debug("Can't acquire lock file %r, exit", self._lock_file)
            return

        log.info("Start running sysguard, lock file %r", self._lock_file)
        error_count = 0
        try:
            while True:
                try:
                    if sys_elapsed_seconds() > 180:
                        self.check()
                    time.sleep(interval)
                except Exception as ex:
                    log.exception(ex)
                    error_count += 1
                    if error_count >= 10:
                        break
        finally:
            # Also release the lock when the loop is left by an exception that
            # is not caught above (for example KeyboardInterrupt).
            self.release_lock()


def main():
    """Command-line entry point: parse options, set up logging, run the guard."""
    # The first positional parameter of ArgumentParser is ``prog``.
    parser = ArgumentParser(prog="系统守护程序")
    add_option = parser.add_argument

    # Resource limits.
    add_option("--cpu-limit", type=float, default=100,
               help="CPU使用率上限")
    add_option("--mem-limit", type=float, default=88,
               help="内存使用率上限")
    add_option("--load-limit", type=float, default=2.5,
               help="系统平均单核负载上限")

    # Run mode.
    add_option("-C", "--continuous", action="store_true",
               help="持续检查，即进程不退出")
    add_option("-I", "--interval", type=float, default=30,
               help="持续检查时的间隔，单位为秒")
    add_option("-L", "--lock-file", help="锁文件")

    # Diagnostics.
    add_option("--loglevel", default="info", help="日志级别")
    add_option("--debug", action="store_true", help="调试模式")

    options = parser.parse_args()

    logging.basicConfig(
        format="%(asctime)s [%(process)d] [%(levelname)s] %(message)s",
        level=options.loglevel.upper(),
        stream=sys.stdout,
    )

    guard = SysGuard(
        load_limit=options.load_limit,
        cpu_limit=options.cpu_limit,
        mem_limit=options.mem_limit,
        lock_file=options.lock_file,
        debug=options.debug,
    )

    # One-shot mode runs a single check; continuous mode loops in the guard.
    if not options.continuous:
        guard.check()
        return
    guard(options.interval)


# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
